# params
CUTOFF <- 4 # number-of-places-with-reviews cutoff per city (cities kept when n > CUTOFF)
#STORE_TYPE <- "Casey's General Store"
#STORE_TYPE <- "SUBWAY®Restaurants"

Read in all reviews and merge in metadata

# Read every per-city review file and stack them, tagging each row with
# the city parsed from the filename prefix (text before the first "_").
all.review.files <- list.files("../data/reviews/")

all.reviews <- all.review.files %>%
  map(function(f) {
    read_feather(paste0("../data/reviews/", f)) %>%
      mutate(city = str_split(f, "_")[[1]][1])
  }) %>%
  bind_rows()

# Drop missing/empty reviews and keep only the columns used downstream.
# FIX: funs() was deprecated in dplyr 0.8; use across() instead of
# mutate_all(funs(...)) — same result, supported API.
all.reviews.clean <- all.reviews %>%
  filter(!is.na(reviews), reviews != "") %>%
  select(city, place.id, reviews, author.names) %>%
  mutate(across(everything(), as.factor))

# Read all place-id metadata files, stack them, and standardize the
# city column name.
all.place_id.files <- list.files("../data/place_ids/")
all.place_id.meta <- all.place_id.files %>%
  map(~ read_feather(paste0("../data/place_ids/", .x))) %>%
  bind_rows() %>%
  rename(city = location)

# One metadata row per (place.id, city, types, name) combination.
# FIX: funs() is deprecated; coerce all columns to factor via across().
all.place_id.meta.clean <- all.place_id.meta %>%
  select(place.id, city, types, name) %>%
  mutate(across(everything(), as.factor)) %>%
  distinct(place.id, city, types, name) # some place ids are duplicated across cities
            
# merge on place id
# merge on place id; review_num is a stable per-review identifier.
# FIXES: `city` was listed twice in select() (redundant — dplyr keeps it
# once either way); row_number() replaces 1:nrow(.) for the same result.
d <- left_join(all.reviews.clean, all.place_id.meta.clean) %>%
  mutate(review_num = row_number()) %>%
  select(review_num, city, place.id, types, name, reviews, author.names)
  #filter(name == STORE_TYPE)

There are 24,469 reviews in total.

Distribution of store types

# In how many cities does each store name appear? (one row per
# name/city pair, then count pairs per name, most widespread first)
all.place_id.meta.clean %>%
  distinct(name, city) %>%
  count(name, sort = TRUE)

Distribution of number of reviews by city

What does the number of reviewed places look like across cities?

# Number of distinct (reviewed) places per city.
n.place.per.city <- d %>%
  distinct(city, name) %>%
  count(city)

# Histogram of the number of reviewed places across cities.
ggplot(n.place.per.city, aes(x = n)) +
  geom_histogram() +
  theme_bw() +
  labs(title = "# place with reviews per city")

# ggplot(n.place.per.city, aes(x = "", y = n)) +
#   geom_boxplot() +
#   theme_bw() +
#   ggtitle("# place with reviews per city") +
#   xlab("")

# Keep only cities with more than CUTOFF reviewed places.
big.cities <- filter(n.place.per.city, n > CUTOFF)

There are 546 cities with at least 5 places that have reviews.

Geographical distribution

# Attach one (lat, lon) pair per city — the first metadata row found.
big.cities <- big.cities %>%
  left_join(all.place_id.meta %>%
              select(city, lat, lon) %>%
              group_by(city) %>%
              slice(1))

# Bounding box around Iowa used to fetch the base map tiles.
iowa_bounding_box = c(-96.6397171020508, 40.3755989074707, # southwest coordinates
                      -90.1400604248047, 43.5011367797852) # northeast coordinates

# NOTE(review): Stamen tile hosting moved to Stadia Maps; get_stamenmap()
# is defunct in current ggmap releases — confirm, and migrate to
# get_stadiamap() (API key required) if this no longer fetches tiles.
map <- get_stamenmap(iowa_bounding_box, zoom = 5, maptype = "toner-lite")

# Map of big cities; point size scales with log(# reviewed places).
ggmap(map) +
  geom_point(data = big.cities,
             aes(x = lon, y = lat, size = log(n)),
             alpha = .5, color = "red") +
  theme_bw()

Now we’re only going to include these cities.

# Restrict the review data to the big cities only.
d.big <- filter(d, city %in% big.cities$city)

Linguistic Measures by City

We calculate these measures on each review and then take the mean by city.

Character measures (review length, prop. capitalized, prop. exclamation, prop. emoji)

# reg expression for emojis
# NOTE(review): builds one "a|b|c" alternation over every emoji glyph;
# assumes emoji()/list_emoji() return literal characters containing no
# regex metacharacters — confirm against the emoji package in use.
emj <- paste(unlist(trimws(emoji(list_emoji(), TRUE))), collapse = "|")

# Clean review text: collapse repeated spaces, strip URLs, newlines and
# quotes, and normalize emoji / ASCII emoticons to the token "EMOJI".
# BUG FIX: the original replaced " " with " " (a no-op); the pattern is
# now " +" so runs of spaces actually collapse, as the comment intended.
d.big <- d.big %>%
  mutate(reviews.clean = trimws(str_replace_all(reviews, " +", " ")),# remove extra spaces
         reviews.clean = str_replace_all(reviews.clean, "http[[:alnum:]]*", ""), # remove urls  
         reviews.clean = str_replace_all(reviews.clean, "\n|\"", ""),
         reviews.clean = str_replace_all(reviews.clean, emj, "EMOJI"),
         reviews.clean = str_replace_all(reviews.clean, ":\\)|:\\(|:-\\)|:-\\(", "EMOJI")) # normalize emojis/emoticons

# Per-review surface measures. IMPROVEMENT: str_count() is vectorized,
# so the rowwise() + gregexpr/regmatches machinery is unnecessary; the
# counts are identical, and d.big is left ungrouped instead of rowwise
# (downstream code calls ungroup() before using it anyway).
# Note: an empty cleaned review gives review.length == 0 and hence
# NaN/Inf proportions, exactly as before (handled later via na.rm and
# the is.infinite -> NA recoding).
d.big <- d.big %>%
  ungroup() %>%
  mutate(review.length = str_length(reviews.clean),
         prop.cap = str_count(reviews.clean, "[A-Z]") / review.length,
         prop.exclam = str_count(reviews.clean, "!") / review.length,
         prop.emoji = str_count(reviews.clean, "EMOJI") / review.length)

Distribution of linguistic measures by city

# Reshape per-review measures to long format. pivot_longer() supersedes
# gather(); its row order differs from gather's column-major stacking,
# but the subsequent aggregation by (city, measure) is unaffected.
long.d <- d.big %>%
  ungroup() %>%
  select(city, review.length, prop.cap, prop.exclam, prop.emoji) %>%
  pivot_longer(cols = -city, names_to = "measure", values_to = "value")

# City-level mean of each linguistic measure.
city.d <- long.d %>%
  group_by(city, measure) %>%
  summarize(mean = mean(value, na.rm = TRUE))

# Histograms of log city means for the proportion-based measures.
city.d %>%
  filter(measure != "review.length") %>%
  ggplot(aes(x = log(mean))) +
  geom_histogram(aes(fill = measure)) +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  theme(legend.position = "none")

# Histogram of log city mean review length.
city.d %>%
  filter(measure == "review.length") %>%
  ggplot(aes(x = log(mean))) +
  geom_histogram(aes(fill = measure)) +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  theme(legend.position = "none")

Geographical distribution

# Join one coordinate pair per city onto the city-level means.
city.d.coord <- ungroup(city.d) %>%
  inner_join(all.place_id.meta %>%
               select(city, lat, lon) %>%
               group_by(city) %>%
               slice(1))

# Map: point transparency scales with log mean review length.
ggmap(map) +
  geom_point(data = filter(city.d.coord, measure == "review.length"),
             aes(x = lon, y = lat, alpha = log(mean)),
             color = "red") +
  ggtitle("log length") +
  theme_bw()

# For prop.cap rows, store log(mean) in a dedicated column; other rows
# keep the raw mean there (the column is only read where
# measure == "prop.cap" in the plot below).
city.d.coord = city.d.coord %>%
  mutate(log.prop.cap = ifelse(measure == "prop.cap",log(mean), mean ))
# log(0) yields -Inf; recode every infinite value to NA, column by column.
is.na(city.d.coord)<-sapply(city.d.coord, is.infinite)

# Map of log capitalization proportion by city.
ggmap(map) +
  geom_point(data = filter(city.d.coord, measure == "prop.cap"),
             aes(x = lon, y = lat, alpha = log.prop.cap),
             color = "red") +
  ggtitle("log prop capitalization") +
  theme_bw()

# Map of log exclamation proportion (zero-usage cities dropped because
# log(0) is undefined).
ggmap(map) +
  geom_point(data = filter(city.d.coord, measure == "prop.exclam", mean != 0),
             aes(x = lon, y = lat, alpha = log(mean)),
             color = "red") +
  ggtitle("log prop exclam") +
  theme_bw()

# Map of log emoji proportion (zero-usage cities dropped because
# log(0) is undefined).
ggmap(map) +
  geom_point(data = filter(city.d.coord, measure == "prop.emoji", mean != 0),
             aes(x = lon, y = lat, alpha = log(mean)),
             color = "red") +
  ggtitle("log prop emoji") +
  theme_bw()

Corpus-wide measures (across city)

Sentiment

Create corpus for each city

# Tokenize reviews to one word per row (reviews are factors, so coerce
# to character first), then drop standard English stop words.
d.tokenized <- d.big %>%
  ungroup() %>%
  mutate(reviews = as.character(reviews)) %>%
  unnest_tokens(word, reviews)
# remove stop words
data(stop_words)

d.tokenized.clean <- anti_join(d.tokenized, stop_words)

# Word frequency table, rarest first. FIX: the original counted with
# sort = TRUE (descending) and then immediately re-sorted ascending;
# the redundant descending sort is dropped. Tie order is unchanged
# because both sorts are stable over count()'s alphabetical base order.
counts <- d.tokenized.clean %>%
  count(word) %>%
  arrange(n)
  
# Remove words that appear fewer than 10 times in the whole corpus.
infrequent.words <- d.tokenized.clean %>%
  count(word, sort = TRUE) %>%
  filter(n < 10)

d.tokenized.clean <- anti_join(d.tokenized.clean, infrequent.words)

# City-level mean AFINN sentiment, most negative city first.
# NOTE(review): assumes the AFINN lexicon exposes a `score` column;
# recent tidytext releases renamed it to `value` — confirm against the
# installed version. This result is recomputed (as affin.mean) further
# below, so this block is effectively superseded.
affin = d.tokenized.clean %>% 
  inner_join(get_sentiments("afinn")) %>%
  group_by(city) %>%
  summarize(mean = mean(score, na.rm = T)) %>%
  arrange(mean)

# num words per city (total cleaned tokens), smallest first
numwords = d.tokenized.clean %>%
  group_by(city) %>%
  summarize(n = n()) %>%
  arrange(n)
# Negative-sentiment word list from the Bing lexicon, used for the
# per-review negativity proportion below.
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

# Total words per review — the denominator for prop.neg below.
# BUG FIX: the original piped the summary into
# select(city, review_num, word), but after summarize() only review_num
# and num_words exist, so that select() errored; it is removed (the
# join below only needs review_num and num_words).
wordcounts <- d.tokenized %>%
  group_by(review_num) %>%
  summarize(num_words = n())

# Share of Bing-negative words in each review.
prop.negative <- d.tokenized %>%
  semi_join(bingnegative) %>%
  count(review_num) %>%
  rename(negativewords = n) %>%
  left_join(wordcounts, by = "review_num") %>%
  transmute(review_num, prop.neg = negativewords / num_words)
# City-level mean AFINN sentiment, stored as affin.mean (supersedes the
# earlier `affin` computation). Assumes the lexicon exposes a `score`
# column; newer tidytext uses `value` — confirm against the installed
# version.
affin <- d.tokenized.clean %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(city) %>%
  summarize(affin.mean = mean(score, na.rm = TRUE)) %>%
  arrange(affin.mean)

# Attach one coordinate pair per city to the sentiment means.
affin.cord <- ungroup(affin) %>%
  inner_join(all.place_id.meta %>%
               select(city, lat, lon) %>%
               group_by(city) %>%
               slice(1))

# Distribution of city-level sentiment.
ggplot(affin.cord, aes(x = affin.mean)) +
  geom_histogram() +
  theme_bw() +
  labs(title = "Sentiment")

# Map: point transparency scales with mean sentiment.
ggmap(map) +
  geom_point(data = affin.cord,
             aes(x = lon, y = lat, alpha = affin.mean),
             color = "red") +
  ggtitle("Sentiment") +
  theme_bw()

Lexical diversity

# Rebuild one text document per city from the cleaned tokens, then
# compute lexical-diversity statistics (TTR etc.) with quanteda.
city.corpus = d.tokenized.clean %>%
  group_by(city) %>%
  summarise(text=paste(word, collapse=" "))

docs =  corpus(city.corpus, 
               doc_id = "city", text_field = "text")

# NOTE(review): dfm(x, groups = ...) was deprecated in quanteda v3 in
# favor of dfm_group() — confirm against the installed version.
dfm_docs = dfm(docs, groups = "city")
# textstat_lexdiv() returns one row per document; here the document name
# is recovered from the rownames into a `city` column for joining later.
# NOTE(review): newer quanteda returns a `document` column instead of
# rownames — confirm.
unigram.diversity = textstat_lexdiv(dfm_docs) %>%
                    rownames_to_column() %>%
                    rename(city = rowname)

Sampling

1:1000
##    [1]    1    2    3    4    5    6    7    8    9   10   11   12   13
##   [14]   14   15   16   17   18   19   20   21   22   23   24   25   26
##   [27]   27   28   29   30   31   32   33   34   35   36   37   38   39
##   [40]   40   41   42   43   44   45   46   47   48   49   50   51   52
##   [53]   53   54   55   56   57   58   59   60   61   62   63   64   65
##   [66]   66   67   68   69   70   71   72   73   74   75   76   77   78
##   [79]   79   80   81   82   83   84   85   86   87   88   89   90   91
##   [92]   92   93   94   95   96   97   98   99  100  101  102  103  104
##  [105]  105  106  107  108  109  110  111  112  113  114  115  116  117
##  [118]  118  119  120  121  122  123  124  125  126  127  128  129  130
##  [131]  131  132  133  134  135  136  137  138  139  140  141  142  143
##  [144]  144  145  146  147  148  149  150  151  152  153  154  155  156
##  [157]  157  158  159  160  161  162  163  164  165  166  167  168  169
##  [170]  170  171  172  173  174  175  176  177  178  179  180  181  182
##  [183]  183  184  185  186  187  188  189  190  191  192  193  194  195
##  [196]  196  197  198  199  200  201  202  203  204  205  206  207  208
##  [209]  209  210  211  212  213  214  215  216  217  218  219  220  221
##  [222]  222  223  224  225  226  227  228  229  230  231  232  233  234
##  [235]  235  236  237  238  239  240  241  242  243  244  245  246  247
##  [248]  248  249  250  251  252  253  254  255  256  257  258  259  260
##  [261]  261  262  263  264  265  266  267  268  269  270  271  272  273
##  [274]  274  275  276  277  278  279  280  281  282  283  284  285  286
##  [287]  287  288  289  290  291  292  293  294  295  296  297  298  299
##  [300]  300  301  302  303  304  305  306  307  308  309  310  311  312
##  [313]  313  314  315  316  317  318  319  320  321  322  323  324  325
##  [326]  326  327  328  329  330  331  332  333  334  335  336  337  338
##  [339]  339  340  341  342  343  344  345  346  347  348  349  350  351
##  [352]  352  353  354  355  356  357  358  359  360  361  362  363  364
##  [365]  365  366  367  368  369  370  371  372  373  374  375  376  377
##  [378]  378  379  380  381  382  383  384  385  386  387  388  389  390
##  [391]  391  392  393  394  395  396  397  398  399  400  401  402  403
##  [404]  404  405  406  407  408  409  410  411  412  413  414  415  416
##  [417]  417  418  419  420  421  422  423  424  425  426  427  428  429
##  [430]  430  431  432  433  434  435  436  437  438  439  440  441  442
##  [443]  443  444  445  446  447  448  449  450  451  452  453  454  455
##  [456]  456  457  458  459  460  461  462  463  464  465  466  467  468
##  [469]  469  470  471  472  473  474  475  476  477  478  479  480  481
##  [482]  482  483  484  485  486  487  488  489  490  491  492  493  494
##  [495]  495  496  497  498  499  500  501  502  503  504  505  506  507
##  [508]  508  509  510  511  512  513  514  515  516  517  518  519  520
##  [521]  521  522  523  524  525  526  527  528  529  530  531  532  533
##  [534]  534  535  536  537  538  539  540  541  542  543  544  545  546
##  [547]  547  548  549  550  551  552  553  554  555  556  557  558  559
##  [560]  560  561  562  563  564  565  566  567  568  569  570  571  572
##  [573]  573  574  575  576  577  578  579  580  581  582  583  584  585
##  [586]  586  587  588  589  590  591  592  593  594  595  596  597  598
##  [599]  599  600  601  602  603  604  605  606  607  608  609  610  611
##  [612]  612  613  614  615  616  617  618  619  620  621  622  623  624
##  [625]  625  626  627  628  629  630  631  632  633  634  635  636  637
##  [638]  638  639  640  641  642  643  644  645  646  647  648  649  650
##  [651]  651  652  653  654  655  656  657  658  659  660  661  662  663
##  [664]  664  665  666  667  668  669  670  671  672  673  674  675  676
##  [677]  677  678  679  680  681  682  683  684  685  686  687  688  689
##  [690]  690  691  692  693  694  695  696  697  698  699  700  701  702
##  [703]  703  704  705  706  707  708  709  710  711  712  713  714  715
##  [716]  716  717  718  719  720  721  722  723  724  725  726  727  728
##  [729]  729  730  731  732  733  734  735  736  737  738  739  740  741
##  [742]  742  743  744  745  746  747  748  749  750  751  752  753  754
##  [755]  755  756  757  758  759  760  761  762  763  764  765  766  767
##  [768]  768  769  770  771  772  773  774  775  776  777  778  779  780
##  [781]  781  782  783  784  785  786  787  788  789  790  791  792  793
##  [794]  794  795  796  797  798  799  800  801  802  803  804  805  806
##  [807]  807  808  809  810  811  812  813  814  815  816  817  818  819
##  [820]  820  821  822  823  824  825  826  827  828  829  830  831  832
##  [833]  833  834  835  836  837  838  839  840  841  842  843  844  845
##  [846]  846  847  848  849  850  851  852  853  854  855  856  857  858
##  [859]  859  860  861  862  863  864  865  866  867  868  869  870  871
##  [872]  872  873  874  875  876  877  878  879  880  881  882  883  884
##  [885]  885  886  887  888  889  890  891  892  893  894  895  896  897
##  [898]  898  899  900  901  902  903  904  905  906  907  908  909  910
##  [911]  911  912  913  914  915  916  917  918  919  920  921  922  923
##  [924]  924  925  926  927  928  929  930  931  932  933  934  935  936
##  [937]  937  938  939  940  941  942  943  944  945  946  947  948  949
##  [950]  950  951  952  953  954  955  956  957  958  959  960  961  962
##  [963]  963  964  965  966  967  968  969  970  971  972  973  974  975
##  [976]  976  977  978  979  980  981  982  983  984  985  986  987  988
##  [989]  989  990  991  992  993  994  995  996  997  998  999 1000
# Total cleaned tokens per city, largest first.
num.words.per.city <- d.tokenized.clean %>%
  count(city) %>%
  arrange(-n)

## TODO (translated from original note, presumably "maps for Walmart?"):
## map this restricted to Walmart only?
unigram.diversity <- unigram.diversity %>%
  left_join(all.place_id.meta %>%
              select(city, lat, lon) %>%
              group_by(city) %>%
              slice(1))

# Map: point transparency scales with the city's type-token ratio.
ggmap(map) +
  geom_point(data = unigram.diversity,
             aes(x = lon, y = lat, alpha = TTR),
             color = "red") +
  ggtitle("TTR") +
  theme_bw()

N-grams

# Bigram tokens per review. FIXES vs. original: (1) reviews is a factor
# (see all.reviews.clean), so it is coerced to character exactly as the
# parallel unigrams block does; (2) the group_by(city, reviews) — which
# grouped by the very column being unnested — served no purpose and is
# removed (replaced by ungroup() for consistency with unigrams).
bigrams <- d.big %>%
  select(city, review_num, reviews) %>%
  ungroup() %>%
  mutate(reviews = as.character(reviews)) %>%
  unnest_tokens(bigram, reviews, token = "ngrams", n = 2)

# Unigram tokens per review (factor -> character before tokenizing).
unigrams <- d.big %>%
  ungroup() %>%
  select(city, review_num, reviews) %>%
  mutate(reviews = as.character(reviews)) %>%
  unnest_tokens(unigram, reviews)

# Number of distinct unigram types per city (one row per unique
# city/unigram pair, then pairs counted per city).
num_unigrams <- unigrams %>%
  distinct(city, unigram) %>%
  count(city) %>%
  rename(n.unigrams = n)

# Bigram volume per city, normalized by the number of unigram types.
bigram.diversity <- bigrams %>%
  group_by(city) %>%
  summarize(n.bigrams = n()) %>%
  left_join(num_unigrams, by = "city") %>%
  mutate(normalized.bigrams = n.bigrams / n.unigrams)

All variables

# Combine all city-level language measures into one wide table.
# FIX: pivot_wider() supersedes spread(); the resulting column order is
# unchanged because city.d rows are sorted by (city, measure). Raw
# measures are replaced by their logs, and unused lexdiv columns dropped.
all.lang <- city.d %>%
  pivot_wider(names_from = measure, values_from = mean) %>%
  left_join(unigram.diversity, by = "city") %>%
  left_join(bigram.diversity, by = "city") %>%
  left_join(affin, by = "city") %>%
  mutate(log.review.length = log(review.length),
         log.prop.cap = log(prop.cap),
         log.prop.exclam = log(prop.exclam),
         log.prop.emoji = log(prop.emoji)) %>%
  select(-review.length, -prop.cap, -prop.exclam,
         -prop.emoji, -U, -lgV0, -lgeV0)

is.na(all.lang)<-sapply(all.lang, is.infinite)

Outcomes measures

Are linguistic measures correlated with outcome measures?

# Outcome measures per city — presumably drive time / distance to the
# nearest highway (columns mindrive, mindist, time); kept as logs.
dist <- read_csv("../data/city_highway_distances.csv") %>%
  rename(city = location) %>%
  mutate(log.mindrive = log(mindrive),
         log.time = log(time)) %>%
  select(-mindrive, -mindist, -time)

# Merge language measures with outcome measures; coordinates are no
# longer needed for the correlation analysis.
all <- all.lang %>%
  left_join(dist) %>%
  select(-lat, -lon)

# Correlation matrix (lower triangle) across all numeric measures,
# rendered as a dot plot with rotated x labels.
correlate(all[, -1]) %>%
  focus(TTR:log.time, mirror = TRUE) %>%
  shave(upper = TRUE) %>%
  rplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

cor.test(all$log.time, all$TTR)
## 
##  Pearson's product-moment correlation
## 
## data:  all$log.time and all$TTR
## t = 7.729, df = 544, p-value = 5.268e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2368956 0.3882203
## sample estimates:
##       cor 
## 0.3145551
# Scatterplots with linear fits for the key pairwise relationships.
ggplot(all, aes(log.time, TTR)) +
  geom_point() + geom_smooth(method = "lm") + theme_bw()

ggplot(all, aes(log.time, log.review.length)) +
  geom_point() + geom_smooth(method = "lm") + theme_bw()

ggplot(all, aes(TTR, log.review.length)) +
  geom_point() + geom_smooth(method = "lm") + theme_bw()

ggplot(all, aes(log.time, affin.mean)) +
  geom_point() + geom_smooth(method = "lm") + theme_bw()

ggplot(all, aes(log.time, log.prop.emoji)) +
  geom_point() + geom_smooth(method = "lm") + theme_bw()

Heteroscedasticity. TO DO: sample rows to get TTR; repeat for Subway only; compute MTLD.

http://www.vli-journal.org/issues/01.1/issue01.1.10.pdf